Inventory management is vital to a company’s health because it balances supply with demand, ensuring that product is available at the right time by tracking it up and down the supply chain. Too much stock costs money and reduces cash flow, while too little stock can lead to unfilled customer orders and lost sales.
This project will produce a Convolutional Neural Network (CNN) which is a deep learning algorithm that can take in an input image, assign importance (learnable weights and biases) to various aspects / objects in the image, and be able to differentiate one from the other. AWS SageMaker will be used to train the CNN model to classify the number of objects in each bin.
# uncomment below to install libraries
# !pip install smdebug
# !pip install altair
# !pip install split-folders
# import libraries
import pandas as pd
import altair as alt
import random
import os
import boto3
from tqdm import tqdm
from PIL import Image
import sagemaker
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, HyperparameterTuner, IntegerParameter
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, ProfilerRule, rule_configs, ProfilerConfig, FrameworkProfile
from sagemaker.analytics import HyperparameterTuningJobAnalytics
The cell below creates a folder called train_data, downloads training data and arranges it in subfolders. Each of these subfolders contains images where the number of objects equals the name of the folder. For instance, folder 1 contains only images with 1 object in them. Images are not divided into training, testing or validation sets. If you feel the number of samples is not enough, you can always download more data (instructions for that can be found here). However, we are not assessing you on the accuracy of your final trained model, but on how you create your machine learning engineering pipeline.
import os
import json
import boto3
def download_and_arrange_data():
    """Download bin images from the public 'aft-vbi-pds' S3 bucket into
    local train_data/<object_count>/ folders, one folder per class.

    file_list.json maps each object count (a string key) to a list of image
    file paths inside the bucket; every listed image is downloaded into the
    folder named after its object count.
    """
    s3_client = boto3.client('s3')
    with open('file_list.json', 'r') as f:
        d = json.load(f)
    for k, v in d.items():
        print(f"Downloading Images with {k} objects")
        directory = os.path.join('train_data', k)
        # exist_ok avoids the race-prone check-then-create pattern
        os.makedirs(directory, exist_ok=True)
        for file_path in tqdm(v):
            # normalize the extension to .jpg whatever the listed suffix is
            file_name = os.path.splitext(os.path.basename(file_path))[0] + '.jpg'
            # S3 keys always use forward slashes, so build the key with an
            # f-string instead of os.path.join (which is OS-dependent)
            s3_client.download_file('aft-vbi-pds', f'bin-images/{file_name}',
                                    os.path.join(directory, file_name))
Run the cell below to download the data from the public S3 bucket into the local train_data folder.
# Populate train_data/<class>/ with JPEGs pulled from the public bucket
download_and_arrange_data()
The Amazon Bin Image Dataset contains 5 classes. After downloading the dataset, we will review the distribution of each class.
# Summarize how many images landed in each class subfolder of train_data.
# Classes are the folder names 1-5 (number of objects in the bin image).
data_contents = [
    {
        'image_class': image_class,
        'number_of_images': len(os.listdir(f'train_data/{image_class}')),
    }
    for image_class in range(1, 6)
]
df_data = pd.DataFrame(data_contents)
# Share of the dataset held by each class, rounded to two decimals.
df_data['percentage'] = round(df_data['number_of_images'] / df_data['number_of_images'].sum(), 2)
df_data
| image_class | number_of_images | percentage | |
|---|---|---|---|
| 0 | 1 | 1228 | 0.12 |
| 1 | 2 | 2299 | 0.22 |
| 2 | 3 | 2666 | 0.26 |
| 3 | 4 | 2373 | 0.23 |
| 4 | 5 | 1875 | 0.18 |
# Overall image count across all five classes
print('Total number of images in dataset are', df_data['number_of_images'].sum())
Total number of images in dataset are 10441
# Bar chart of the raw class distribution, with the count printed above each bar.
count_bars = alt.Chart(df_data).mark_bar(size=50).encode(
    x=alt.X('image_class:O', title='Image Class'),
    y=alt.Y('number_of_images:Q', title='Number of Images', axis=alt.Axis(domain=False)),
)
count_labels = count_bars.mark_text(align='center', baseline='bottom', dx=3).encode(
    text='number_of_images:Q',
)
# Layer the labels on top of the bars and add a title/subtitle.
(count_bars + count_labels).properties(
    width=400,
    title={
        'text': ['Distribution of images per class'],
        'subtitle': ['Total number of images in dataset are 10,441', '\n'],
    },
).configure_title(fontSize=18, anchor='start')
Let's create a function that takes a subfolder and returns a random image.
# create a function that returns a random image from a given class subfolder
def get_random_image(subfolder):
    """Return a randomly chosen PIL image from train_data/<subfolder>."""
    folder = f'train_data/{subfolder}'
    chosen_file = random.choice(os.listdir(folder))
    print(f'Sample image returned for image class {subfolder}')
    return Image.open(os.path.join(folder, chosen_file))
get_random_image(1)
Sample image returned for image class 1
get_random_image(2)
Sample image returned for image class 2
get_random_image(3)
Sample image returned for image class 3
get_random_image(4)
Sample image returned for image class 4
get_random_image(5)
Sample image returned for image class 5
The split-folders Python module is a nice package that allows us to take a folder that has subfolders and split the files into train, validation, and test folders. If you have tqdm installed then a progress bar will be displayed when moving files.
import splitfolders
# Source folder with one subfolder per class; destination gets train/val/test.
source_dir = 'train_data'
destination_dir = 'data'
# 80/10/10 split; the fixed seed keeps the split reproducible across runs.
splitfolders.ratio(source_dir, output=destination_dir, seed=1337, ratio=(0.8, 0.1, 0.1))
# Recount images per class, now broken down by train/val/test split.
# Outer loop walks the split folders, inner loop walks each class subfolder.
data_contents = [
    {
        'split': split_name,
        'image_class': class_name,
        'number_of_images': len(os.listdir(os.path.join('data', split_name, class_name))),
    }
    for split_name in os.listdir('data')
    for class_name in os.listdir(os.path.join('data', split_name))
]
df_data = pd.DataFrame(data_contents)
df_data
| split | image_class | number_of_images | |
|---|---|---|---|
| 0 | test | 4 | 238 |
| 1 | test | 2 | 231 |
| 2 | test | 1 | 124 |
| 3 | test | 5 | 188 |
| 4 | test | 3 | 268 |
| 5 | train | 4 | 1898 |
| 6 | train | 2 | 1839 |
| 7 | train | 1 | 982 |
| 8 | train | 5 | 1500 |
| 9 | train | 3 | 2132 |
| 10 | val | 4 | 237 |
| 11 | val | 2 | 229 |
| 12 | val | 1 | 122 |
| 13 | val | 5 | 187 |
| 14 | val | 3 | 266 |
# Total image count per split (train / val / test)
df_data.groupby('split')['number_of_images'].sum()
split test 1049 train 8351 val 1041 Name: number_of_images, dtype: int64
# Grouped bar chart: image counts per split, one column of bars per class.
bars = alt.Chart(df_data).mark_bar().encode(
    # fixed: the x channel encodes the split, so the axis title should say so
    # (it previously read 'Image Class', which labels the columns instead)
    x=alt.X('split:N', title='Split'),
    y=alt.Y('number_of_images:Q', title='Number of Images', axis=alt.Axis(domain=False)),
    color=alt.Color('split'),
    column=alt.Column('image_class:O', title=None)  # one column per image class
).properties(
    width=80,
    title={
        'text': ['Distribution of images per class'],
        'subtitle': ['Train set images: 8351','Validation set images: 1041', 'Test set images: 1049', '\n']
    }
).configure_title(fontSize=18, anchor='start')
bars
# upload data folder to s3 bucket and use --quiet to suppress progress messages
!aws s3 cp data s3://amazon-bin-images/ --recursive --quiet
Let's confirm that the data was uploaded to S3.
! aws s3 ls s3://amazon-bin-images/
# Hyperparameter search space for tuning
hyperparameter_ranges = {
    "lr": ContinuousParameter(0.001, 0.1),
    "batch_size": CategoricalParameter([32, 64]),
    # NOTE(review): IntegerParameter samples ANY integer in [10, 20], not just
    # 10/15/20 — use CategoricalParameter([10, 15, 20]) if only those were intended
    "epochs": IntegerParameter(10,20)
}

role = sagemaker.get_execution_role()

# The metric regex must match the log line the training script emits from its
# test function ("Testing Loss: <value>").
# NOTE(review): the original comment pointed at line 44 of train.py, but the
# entry_point below is hpo.py — confirm which script emits this metric.
objective_metric_name = "Test Loss"
objective_type = "Minimize"
metric_definitions = [{"Name": "Test Loss", "Regex": "Testing Loss: ([0-9\\.]+)"}]

# Base estimator that the tuner clones for every trial (one GPU instance each)
estimator = PyTorch(
    entry_point="hpo.py",
    base_job_name='pytorch_inventory_hpo',
    role=role,
    framework_version="1.4.0",
    instance_count=1,
    instance_type="ml.g4dn.xlarge",
    py_version='py3'
)

# Tuner: 4 trials total, 2 in parallel, minimizing test loss; early stopping
# lets SageMaker abort trials that are clearly underperforming.
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=4,
    max_parallel_jobs=2,
    objective_type=objective_type,
    early_stopping_type="Auto"
)
Select training, test, and validation datasets and fit them to the tuner.
bucket = 'amazon-bin-images'
# Map each SageMaker channel name to its S3 prefix; note the "validate"
# channel reads from the "val" prefix created by splitfolders.
model_inputs = {
    channel: f's3://{bucket}/{prefix}/'
    for channel, prefix in (('train', 'train'), ('test', 'test'), ('validate', 'val'))
}
model_inputs
{'train': 's3://amazon-bin-images/train/',
'test': 's3://amazon-bin-images/test/',
'validate': 's3://amazon-bin-images/val/'}
# Launch the tuning jobs; wait=True blocks the notebook until all trials finish
tuner.fit(model_inputs, wait=True)
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
........................................................................................................................................................................................................................................................................................!
# Pull the results of the most recent tuning job into a DataFrame.
exp = HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name=tuner._current_job_name)  # name of latest hpo training job
jobs = exp.dataframe()
# Sort by test loss, highest first (best/lowest loss appears last).
# fixed: pass a real bool instead of the truthy int 0 for `ascending`.
jobs.sort_values('FinalObjectiveValue', ascending=False)
| batch_size | epochs | lr | TrainingJobName | TrainingJobStatus | FinalObjectiveValue | TrainingStartTime | TrainingEndTime | TrainingElapsedTimeSeconds | |
|---|---|---|---|---|---|---|---|---|---|
| 3 | "32" | 12.0 | 0.058081 | pytorch-training-221231-0221-001-d0b07d19 | Completed | 1.580604 | 2022-12-31 02:23:28+00:00 | 2022-12-31 02:32:57+00:00 | 569.0 |
| 0 | "32" | 11.0 | 0.005295 | pytorch-training-221231-0221-004-2d45c96a | Completed | 1.546589 | 2022-12-31 02:35:00+00:00 | 2022-12-31 02:39:08+00:00 | 248.0 |
| 1 | "64" | 12.0 | 0.025270 | pytorch-training-221231-0221-003-d81977e4 | Completed | 1.474337 | 2022-12-31 02:33:56+00:00 | 2022-12-31 02:44:10+00:00 | 614.0 |
| 2 | "64" | 20.0 | 0.001467 | pytorch-training-221231-0221-002-cfd6ed7a | Completed | 1.448651 | 2022-12-31 02:23:02+00:00 | 2022-12-31 02:34:46+00:00 | 704.0 |
# Identify the tuning job, then reattach to its best training job
print(tuner.describe()['HyperParameterTuningJobName'])
# best_estimator() returns the estimator of the trial with the lowest Test Loss
best_estimator = tuner.best_estimator()
best_estimator.hyperparameters()
pytorch-training-221231-0221 2022-12-31 02:34:58 Starting - Preparing the instances for training 2022-12-31 02:34:58 Downloading - Downloading input data 2022-12-31 02:34:58 Training - Training image download completed. Training in progress. 2022-12-31 02:34:58 Uploading - Uploading generated training model 2022-12-31 02:34:58 Completed - Resource reused by training job: pytorch-training-221231-0221-004-2d45c96a
{'_tuning_objective_metric': '"Test Loss"',
'batch_size': '"64"',
'epochs': '20',
'lr': '0.0014671136028592893',
'sagemaker_container_log_level': '20',
'sagemaker_estimator_class_name': '"PyTorch"',
'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
'sagemaker_job_name': '"pytorch_inventory_hpo-2022-12-31-02-21-34-777"',
'sagemaker_program': '"hpo.py"',
'sagemaker_region': '"us-east-1"',
'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-468866419413/pytorch_inventory_hpo-2022-12-31-02-21-34-777/source/sourcedir.tar.gz"'}
# SageMaker returns tuned hyperparameters as strings, some wrapped in literal
# quotes (e.g. batch_size '"64"'). Fetch the dict once, strip any surrounding
# quotes uniformly, and cast each value back to its native type.
# fixed: lr was previously the only value not de-quoted, which would crash if
# SageMaker ever returned it quoted; hyperparameters() was also called 3 times.
best_hps = best_estimator.hyperparameters()
hyperparameters = {"batch_size": int(best_hps['batch_size'].strip('"')),
                   "lr": float(best_hps['lr'].strip('"')),
                   "epochs": int(best_hps['epochs'].strip('"'))}
hyperparameters
{'batch_size': 64, 'lr': 0.0014671136028592893, 'epochs': 20}
We will use model debugging and profiling to better monitor and debug the model training job.
# Set up debugging and profiling rules and hooks
from sagemaker.debugger import (
Rule,
DebuggerHookConfig,
rule_configs,
ProfilerRule,
CollectionConfig,
)
# Debugger rules: four common training pathologies plus the built-in profiler report
rules = [
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.poor_weight_initialization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]
# Save debug tensors every 10 training steps and every evaluation step
hook_config = DebuggerHookConfig(
    hook_parameters={
        "train.save_interval": "10",
        "eval.save_interval": "1"
    }
)
# Sample system metrics every 500 ms; profile framework internals for 5 steps
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500, framework_profile_params=FrameworkProfile(num_steps=5)
)
# Capture the cross-entropy loss tensor at the same save intervals
collection_configs=[CollectionConfig(name="CrossEntropyLoss_output_0",parameters={
    "include_regex": "CrossEntropyLoss_output_0", "train.save_interval": "10","eval.save_interval": "1"})]
# NOTE(review): debugger_config is never passed to the estimator below — it
# receives hook_config instead — so this collection config appears unused.
# Confirm which hook configuration is intended.
debugger_config=DebuggerHookConfig(collection_configs=collection_configs )
An estimator will be created using the optimal hyperparameters found during tuning.
# Train a final model with the best tuned hyperparameters, with the Debugger
# rules/hook and Profiler attached, then block until the job completes.
estimator = PyTorch(
    entry_point='train_debug_model.py',
    base_job_name='pytorch-inventory-estimator',
    role=role,
    instance_count=1,
    instance_type='ml.g4dn.xlarge',
    framework_version='1.4.0',
    py_version='py3',
    hyperparameters=hyperparameters,  # best values found by the tuner above
    ## Debugger and Profiler parameters
    rules = rules,
    debugger_hook_config=hook_config,
    profiler_config=profiler_config,
)
estimator.fit(model_inputs, wait=True)
2022-12-31 03:07:25 Starting - Starting the training job... 2022-12-31 03:07:54 Starting - Preparing the instances for trainingVanishingGradient: InProgress Overfit: InProgress Overtraining: InProgress PoorWeightInitialization: InProgress ProfilerReport: InProgress ...... 2022-12-31 03:08:55 Downloading - Downloading input data... 2022-12-31 03:09:15 Training - Downloading the training image...... 2022-12-31 03:10:15 Training - Training image download completed. Training in progress...bash: cannot set terminal process group (-1): Inappropriate ioctl for device bash: no job control in this shell 2022-12-31 03:10:32,421 sagemaker-containers INFO Imported framework sagemaker_pytorch_container.training 2022-12-31 03:10:32,451 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 2022-12-31 03:10:32,454 sagemaker_pytorch_container.training INFO Invoking user training script. 2022-12-31 03:10:32,619 sagemaker-containers INFO Module default_user_module_name does not provide a setup.py. Generating setup.py 2022-12-31 03:10:32,619 sagemaker-containers INFO Generating setup.cfg 2022-12-31 03:10:32,619 sagemaker-containers INFO Generating MANIFEST.in 2022-12-31 03:10:32,619 sagemaker-containers INFO Installing module with the following command: /opt/conda/bin/python3.6 -m pip install . 
Processing /tmp/tmpvff668ub/module_dir Building wheels for collected packages: default-user-module-name Building wheel for default-user-module-name (setup.py): started Building wheel for default-user-module-name (setup.py): finished with status 'done' Created wheel for default-user-module-name: filename=default_user_module_name-1.0.0-py2.py3-none-any.whl size=8368 sha256=0cac86a4255c54a22c80e7098a1774c8a96e0997797e2b85ed229f195034e6e3 Stored in directory: /tmp/pip-ephem-wheel-cache-fdtnscvv/wheels/2c/ff/5d/5f7b5fc93b8d567250c5b56f1e17c4124b0dbc96c3ea4eb80d Successfully built default-user-module-name Installing collected packages: default-user-module-name Successfully installed default-user-module-name-1.0.0 2022-12-31 03:10:34,699 sagemaker-containers INFO Invoking user script Training Env: { "additional_framework_parameters": {}, "channel_input_dirs": { "test": "/opt/ml/input/data/test", "train": "/opt/ml/input/data/train", "validate": "/opt/ml/input/data/validate" }, "current_host": "algo-1", "framework_module": "sagemaker_pytorch_container.training:main", "hosts": [ "algo-1" ], "hyperparameters": { "batch_size": 64, "epochs": 20, "lr": 0.0014671136028592893 }, "input_config_dir": "/opt/ml/input/config", "input_data_config": { "test": { "TrainingInputMode": "File", "S3DistributionType": "FullyReplicated", "RecordWrapperType": "None" }, "train": { "TrainingInputMode": "File", "S3DistributionType": "FullyReplicated", "RecordWrapperType": "None" }, "validate": { "TrainingInputMode": "File", "S3DistributionType": "FullyReplicated", "RecordWrapperType": "None" } }, "input_dir": "/opt/ml/input", "is_master": true, "job_name": "pytorch-inventory-estimator-2022-12-31-03-07-24-661", "log_level": 20, "master_hostname": "algo-1", "model_dir": "/opt/ml/model", "module_dir": "s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/source/sourcedir.tar.gz", "module_name": "train_debug_model", "network_interface_name": "eth0", "num_cpus": 4, 
"num_gpus": 1, "output_data_dir": "/opt/ml/output/data", "output_dir": "/opt/ml/output", "output_intermediate_dir": "/opt/ml/output/intermediate", "resource_config": { "current_host": "algo-1", "current_instance_type": "ml.g4dn.xlarge", "current_group_name": "homogeneousCluster", "hosts": [ "algo-1" ], "instance_groups": [ { "instance_group_name": "homogeneousCluster", "instance_type": "ml.g4dn.xlarge", "hosts": [ "algo-1" ] } ], "network_interface_name": "eth0" }, "user_entry_point": "train_debug_model.py" } Environment variables: SM_HOSTS=["algo-1"] SM_NETWORK_INTERFACE_NAME=eth0 SM_HPS={"batch_size":64,"epochs":20,"lr":0.0014671136028592893} SM_USER_ENTRY_POINT=train_debug_model.py SM_FRAMEWORK_PARAMS={} SM_RESOURCE_CONFIG={"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.xlarge"}],"network_interface_name":"eth0"} SM_INPUT_DATA_CONFIG={"test":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"},"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"},"validate":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}} SM_OUTPUT_DATA_DIR=/opt/ml/output/data SM_CHANNELS=["test","train","validate"] SM_CURRENT_HOST=algo-1 SM_MODULE_NAME=train_debug_model SM_LOG_LEVEL=20 SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main SM_INPUT_DIR=/opt/ml/input SM_INPUT_CONFIG_DIR=/opt/ml/input/config SM_OUTPUT_DIR=/opt/ml/output SM_NUM_CPUS=4 SM_NUM_GPUS=1 SM_MODEL_DIR=/opt/ml/model SM_MODULE_DIR=s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/source/sourcedir.tar.gz 
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"test":"/opt/ml/input/data/test","train":"/opt/ml/input/data/train","validate":"/opt/ml/input/data/validate"},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{"batch_size":64,"epochs":20,"lr":0.0014671136028592893},"input_config_dir":"/opt/ml/input/config","input_data_config":{"test":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"},"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"},"validate":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"job_name":"pytorch-inventory-estimator-2022-12-31-03-07-24-661","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/source/sourcedir.tar.gz","module_name":"train_debug_model","network_interface_name":"eth0","num_cpus":4,"num_gpus":1,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.xlarge"}],"network_interface_name":"eth0"},"user_entry_point":"train_debug_model.py"} SM_USER_ARGS=["--batch_size","64","--epochs","20","--lr","0.0014671136028592893"] SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate SM_CHANNEL_TEST=/opt/ml/input/data/test SM_CHANNEL_TRAIN=/opt/ml/input/data/train SM_CHANNEL_VALIDATE=/opt/ml/input/data/validate SM_HP_BATCH_SIZE=64 SM_HP_EPOCHS=20 SM_HP_LR=0.0014671136028592893 
PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages Invoking script with the following command: /opt/conda/bin/python3.6 train_debug_model.py --batch_size 64 --epochs 20 --lr 0.0014671136028592893 Running on Device cuda:0 Hyperparameters : LR: 0.0014671136028592893, Batch Size: 64, Epoch: 20 Model Dir Path: /opt/ml/model Output Dir Path: /opt/ml/output/data [2022-12-31 03:10:38.110 algo-1:46 INFO json_config.py:90] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json. [2022-12-31 03:10:38.111 algo-1:46 INFO hook.py:192] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries. [2022-12-31 03:10:38.111 algo-1:46 INFO hook.py:237] Saving to /opt/ml/output/tensors [2022-12-31 03:10:38.111 algo-1:46 INFO state_store.py:67] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist. Get train and test data loader Begin Training Start Training Epoch 0, Phase train [2022-12-31 03:10:38.712 algo-1:46 INFO hook.py:382] Monitoring the collections: losses, gradients, relu_input [2022-12-31 03:10:38.712 algo-1:46 INFO hook.py:443] Hook is writing from the hook with pid: 46 valid loss: 90.7028, acc: 18.0000, best loss: 90.7028 Epoch 1, Phase train train loss: 93.0653, acc: 19.0000, best loss: 90.7028 Epoch 1, Phase valid valid loss: 89.9707, acc: 18.0000, best loss: 89.9707 Epoch 2, Phase train train loss: 92.3530, acc: 20.0000, best loss: 89.9707 Epoch 2, Phase valid valid loss: 88.2683, acc: 18.0000, best loss: 88.2683 Epoch 3, Phase train VanishingGradient: InProgress Overfit: InProgress Overtraining: InProgress PoorWeightInitialization: Error train loss: 91.2974, acc: 20.0000, best loss: 88.2683 Epoch 3, Phase valid valid loss: 88.8757, acc: 18.0000, best loss: 88.2683 Begin Testing Start Testing Testing Loss: 1.4615891885484709 Testing Accuracy: 0.30505243088655865 
Saving The Model Model Saved 2022-12-31 03:17:48,999 sagemaker-containers INFO Reporting training SUCCESS 2022-12-31 03:18:19 Uploading - Uploading generated training model 2022-12-31 03:18:19 Completed - Training job completed Training seconds: 580 Billable seconds: 580
From the output above we can see that the testing accuracy was 30% during model training.
# Record the completed job's name for later artifact lookups
training_job_name = estimator.latest_training_job.name
print(f"Training jobname: {training_job_name}")
Training jobname: pytorch-inventory-estimator-2022-12-31-03-07-24-661
pd.DataFrame(estimator.latest_training_job.rule_job_summary())
| RuleConfigurationName | RuleEvaluationJobArn | RuleEvaluationStatus | LastModifiedTime | StatusDetails | |
|---|---|---|---|---|---|
| 0 | VanishingGradient | arn:aws:sagemaker:us-east-1:468866419413:proce... | NoIssuesFound | 2022-12-31 03:18:39.182000+00:00 | NaN |
| 1 | Overfit | arn:aws:sagemaker:us-east-1:468866419413:proce... | NoIssuesFound | 2022-12-31 03:18:39.182000+00:00 | NaN |
| 2 | Overtraining | arn:aws:sagemaker:us-east-1:468866419413:proce... | NoIssuesFound | 2022-12-31 03:18:39.182000+00:00 | NaN |
| 3 | PoorWeightInitialization | arn:aws:sagemaker:us-east-1:468866419413:proce... | Error | 2022-12-31 03:18:39.182000+00:00 | InternalServerError: We encountered an interna... |
| 4 | ProfilerReport | arn:aws:sagemaker:us-east-1:468866419413:proce... | NoIssuesFound | 2022-12-31 03:18:33.180000+00:00 | NaN |
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys
# Load the debug tensors SageMaker saved during training
trial = create_trial(estimator.latest_job_debugger_artifacts_path())
print(trial.tensor_names())
# Number of saved steps for the loss tensor in each phase (TRAIN vs EVAL)
print(len(trial.tensor("CrossEntropyLoss_output_0").steps(mode=ModeKeys.TRAIN)))
print(len(trial.tensor("CrossEntropyLoss_output_0").steps(mode=ModeKeys.EVAL)))
[2022-12-31 03:32:46.994 datascience-1-0-ml-m5-large-ab1c8cfd3029fb39f3ca96edc853:19 INFO s3_trial.py:42] Loading trial debug-output at path s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/debug-output
[2022-12-31 03:32:47.213 datascience-1-0-ml-m5-large-ab1c8cfd3029fb39f3ca96edc853:19 WARNING s3handler.py:183] Encountered the exception An error occurred while reading from response stream: ('Connection broken: IncompleteRead(0 bytes read, 5111 more expected)', IncompleteRead(0 bytes read, 5111 more expected)) while reading s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/debug-output/index/000000000/000000000090_worker_0.json . Will retry now
[2022-12-31 03:32:49.653 datascience-1-0-ml-m5-large-ab1c8cfd3029fb39f3ca96edc853:19 INFO trial.py:198] Training has ended, will refresh one final time in 1 sec.
[2022-12-31 03:32:50.693 datascience-1-0-ml-m5-large-ab1c8cfd3029fb39f3ca96edc853:19 INFO trial.py:210] Loaded all steps
['CrossEntropyLoss_output_0', 'gradient/ResNet_fc.0.bias', 'gradient/ResNet_fc.0.weight', 'gradient/ResNet_fc.1.bias', 'gradient/ResNet_fc.1.weight', 'gradient/ResNet_fc.3.bias', 'gradient/ResNet_fc.3.weight', 'gradient/ResNet_fc.5.bias', 'gradient/ResNet_fc.5.weight', 'layer1.0.relu_input_0', 'layer1.0.relu_input_1', 'layer1.0.relu_input_2', 'layer1.1.relu_input_0', 'layer1.1.relu_input_1', 'layer1.1.relu_input_2', 'layer1.2.relu_input_0', 'layer1.2.relu_input_1', 'layer1.2.relu_input_2', 'layer2.0.relu_input_0', 'layer2.0.relu_input_1', 'layer2.0.relu_input_2', 'layer2.1.relu_input_0', 'layer2.1.relu_input_1', 'layer2.1.relu_input_2', 'layer2.2.relu_input_0', 'layer2.2.relu_input_1', 'layer2.2.relu_input_2', 'layer2.3.relu_input_0', 'layer2.3.relu_input_1', 'layer2.3.relu_input_2', 'layer3.0.relu_input_0', 'layer3.0.relu_input_1', 'layer3.0.relu_input_2', 'layer3.1.relu_input_0', 'layer3.1.relu_input_1', 'layer3.1.relu_input_2', 'layer3.2.relu_input_0', 'layer3.2.relu_input_1', 'layer3.2.relu_input_2', 'layer3.3.relu_input_0', 'layer3.3.relu_input_1', 'layer3.3.relu_input_2', 'layer3.4.relu_input_0', 'layer3.4.relu_input_1', 'layer3.4.relu_input_2', 'layer3.5.relu_input_0', 'layer3.5.relu_input_1', 'layer3.5.relu_input_2', 'layer4.0.relu_input_0', 'layer4.0.relu_input_1', 'layer4.0.relu_input_2', 'layer4.1.relu_input_0', 'layer4.1.relu_input_1', 'layer4.1.relu_input_2', 'layer4.2.relu_input_0', 'layer4.2.relu_input_1', 'layer4.2.relu_input_2', 'relu_input_0']
53
85
def get_data(trial, tname, mode):
    """Return (steps, values) for tensor `tname` in the given mode (TRAIN/EVAL)."""
    tensor = trial.tensor(tname)
    steps = tensor.steps(mode=mode)
    values = [tensor.value(step, mode=mode) for step in steps]
    return steps, values
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import host_subplot
def plot_tensor(trial, tensor_name):
    """Plot a saved tensor's TRAIN and EVAL series on twin x-axes.

    The two phases save different numbers of steps, so each series gets its
    own x-axis: host carries TRAIN steps, par (a twiny axis) carries EVAL steps.
    """
    steps_train, vals_train = get_data(trial, tensor_name, mode=ModeKeys.TRAIN)
    print("loaded TRAIN data")
    steps_eval, vals_eval = get_data(trial, tensor_name, mode=ModeKeys.EVAL)
    print("loaded EVAL data")
    fig = plt.figure(figsize=(10, 7))
    host = host_subplot(111)
    par = host.twiny()  # second x-axis sharing the same y-axis
    host.set_xlabel("Steps (TRAIN)")
    par.set_xlabel("Steps (EVAL)")
    host.set_ylabel(tensor_name)
    (p1,) = host.plot(steps_train, vals_train, label=tensor_name)
    print("completed TRAIN plot")
    (p2,) = par.plot(steps_eval, vals_eval, label="val_" + tensor_name)
    print("completed EVAL plot")
    leg = plt.legend()
    # color each axis label and legend entry to match its line
    host.xaxis.get_label().set_color(p1.get_color())
    leg.texts[0].set_color(p1.get_color())
    par.xaxis.get_label().set_color(p2.get_color())
    leg.texts[1].set_color(p2.get_color())
    plt.ylabel(tensor_name)
    plt.show()
plot_tensor(trial, "CrossEntropyLoss_output_0")
loaded TRAIN data loaded EVAL data completed TRAIN plot completed EVAL plot
Cross entropy loss for the training phase remains rather stable throughout training. The validation phase has its cross entropy loss start high, then dive low, and then stabilize somewhat but the range of values are larger than the cross entropy loss for the training phase.
# Rule evaluation artifacts live under <output_path><job_name>/rule-output.
# NOTE(review): plain concatenation works only because output_path appears to
# end with '/' (see the resulting URI printed below) — confirm before reuse.
rule_output_path = estimator.output_path + estimator.latest_training_job.job_name + "/rule-output"
rule_output_path
's3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output'
Download the profiler report.
! aws s3 cp {rule_output_path} ./ --recursive
download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json to ProfilerReport/profiler-output/profiler-reports/BatchSize.json download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json to ProfilerReport/profiler-output/profiler-reports/Dataloader.json download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/MaxInitializationTime.json to ProfilerReport/profiler-output/profiler-reports/MaxInitializationTime.json download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json to ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-report.html to ProfilerReport/profiler-output/profiler-report.html download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb to ProfilerReport/profiler-output/profiler-report.ipynb download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/IOBottleneck.json to ProfilerReport/profiler-output/profiler-reports/IOBottleneck.json download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/LowGPUUtilization.json to ProfilerReport/profiler-output/profiler-reports/LowGPUUtilization.json download: 
s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json to ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/OverallFrameworkMetrics.json to ProfilerReport/profiler-output/profiler-reports/OverallFrameworkMetrics.json download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/LoadBalancing.json to ProfilerReport/profiler-output/profiler-reports/LoadBalancing.json download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/StepOutlier.json to ProfilerReport/profiler-output/profiler-reports/StepOutlier.json download: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/rule-output/ProfilerReport/profiler-output/profiler-reports/OverallSystemUsage.json to ProfilerReport/profiler-output/profiler-reports/OverallSystemUsage.json
# Collect rule names produced by this job and keep the profiler one
# (exactly one rule summary has "Profiler" in its configuration name).
profiler_rules = [
    summary["RuleConfigurationName"]
    for summary in estimator.latest_training_job.rule_job_summary()
    if "Profiler" in summary["RuleConfigurationName"]
]
profiler_report_name = profiler_rules[0]
Run the cell below to display the profiler report.
import IPython
# Render the downloaded profiler HTML report inline in the notebook
IPython.display.HTML(filename=profiler_report_name + "/profiler-output/profiler-report.html")
SageMaker Debugger auto generated this report. You can generate similar reports on all supported training jobs. The report provides summary of training job, system resource usage statistics, framework metrics, rules summary, and detailed analysis from each rule. The graphs and tables are interactive.
Legal disclaimer: This report and any recommendations are provided for informational purposes only and are not definitive. You are responsible for making your own independent assessment of the information.
# Parameters
# NOTE(review): this ARN looks like a papermill-injected parameter copied from
# the profiler report notebook; it is not referenced anywhere below — confirm
# whether it can be removed.
processing_job_arn = "arn:aws:sagemaker:us-east-1:468866419413:processing-job/pytorch-inventory-estimato-profilerreport-994474c3"
# S3 location of the model artifact produced by the debug/profile training job
model_path = estimator.output_path + estimator.latest_training_job.job_name + "/output/model.tar.gz"
print(f"Model: {model_path}")
Model: s3://sagemaker-us-east-1-468866419413/pytorch-inventory-estimator-2022-12-31-03-07-24-661/output/model.tar.gz
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor
# Serialize requests as raw JPEG bytes; parse JSON responses from the endpoint.
jpeg_serializer = sagemaker.serializers.IdentitySerializer("image/jpeg")
json_deserializer = sagemaker.deserializers.JSONDeserializer()

class ImagePredictor(Predictor):
    """Predictor that sends JPEG bytes and receives JSON class scores."""

    def __init__(self, endpoint_name, sagemaker_session):
        # zero-argument super() is the Python 3 idiom (file already targets py3)
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=jpeg_serializer,
            deserializer=json_deserializer,
        )
# Wrap the trained model artifact for deployment with a custom inference script.
# NOTE(review): deployment uses framework 1.8 / py36 while training used
# 1.4.0 / py3 — confirm the saved weights load correctly across versions.
pytorch_model = PyTorchModel(model_data=model_path,
                             role=role,
                             entry_point='inference.py',
                             py_version='py36',
                             framework_version='1.8',
                             predictor_cls=ImagePredictor)
# Deploy one CPU instance behind a real-time endpoint
predictor = pytorch_model.deploy(initial_instance_count=1, instance_type="ml.m5.large")
------!
import numpy as np
import io
def predict_image_label(dataset, subfolder):
    """Send a random image from data/<dataset>/<subfolder> to the endpoint,
    print the true and predicted labels, and display the image inline."""
    folder = f'data/{dataset}/{subfolder}'
    chosen_file = random.choice(os.listdir(folder))
    image_path = os.path.join(f'data/{dataset}', f'{subfolder}', chosen_file)
    with open(image_path, 'rb') as image_file:
        payload = image_file.read()
    # send the raw JPEG bytes to the deployed endpoint
    response = predictor.predict(payload, initial_args={"ContentType": "image/jpeg"})
    print(f'Image path: {image_path}')
    print(f'Image label: {subfolder}')
    # the index of the highest score is the predicted class
    predicted_label = np.argmax(response, 1)[0]
    print(f'Predicted label: {predicted_label}')
    return display(Image.open(io.BytesIO(payload)))
predict_image_label('test', 1)
Image path: data/test/1/08945.jpg Image label: 1 Predicted label: 0
predict_image_label('test', 2)
Image path: data/test/2/01623.jpg Image label: 2 Predicted label: 0
predict_image_label('test', 3)
Image path: data/test/3/01663.jpg Image label: 3 Predicted label: 2
predict_image_label('test', 4)
Image path: data/test/4/100801.jpg Image label: 4 Predicted label: 2
predict_image_label('test', 5)
Image path: data/test/5/100403.jpg Image label: 5 Predicted label: 2
When we’re done with the endpoint, we can just delete it and the backing instances will be released. Run the following cell to delete the endpoint.
# Tear down the endpoint so the hosting instance stops incurring charges
predictor.delete_endpoint()